library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.1 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(openintro) # a bunch of data sets
## Please visit openintro.org for free statistics materials
##
## Attaching package: 'openintro'
## The following object is masked from 'package:ggplot2':
##
## diamonds
## The following objects are masked from 'package:datasets':
##
## cars, trees
library(broom)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Allows you to model and predict numeric and categorical variables using multiple input variables. Learn how to fit, visualize, and interpret the models.
Extend simple linear regression to a number of variables, which can be both numeric and categorical. Logistic regression allows us to model a binary response.
Car example Want to look at fuel efficiency over time while controlling for engine size Use a parallel slopes model
We will fit a parallel slopes model using lm(). In addition to the data argument, lm() needs to know which variables you want to include in your regression model, and how you want to include them. It accomplishes this using a formula argument. A simple linear regression formula looks like y ~ x, where y is the name of the response variable, and x is the name of the explanatory variable. Here, we will simply extend this formula to include multiple explanatory variables. A parallel slopes model has the form y ~ x + z, where z is a categorical explanatory variable, and x is a numerical explanatory variable.
# Multiple regression ----
# Quick look at the structure of the built-in mtcars data set
# (32 observations, 11 numeric variables).
glimpse(mtcars)
## Observations: 32
## Variables: 11
## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2,…
## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4,…
## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140…
## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 18…
## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92,…
## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.1…
## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.…
## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,…
## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4,…
## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1,…
# Parallel slopes model: highway mpg as a function of engine displacement
# (numeric) and model year (categorical). Wrapping year in factor() makes
# R treat it as categorical rather than numeric.
mod <- lm(hwy ~ displ + factor(year), data = mpg) # need to make sure that r recognizes year as a categorical variable which is why you must use factor()
# augment() attaches fitted values (.fitted), residuals, etc. to the data.
mod_aug <- augment(mod)
# Plot lines with geom_line for predicted values.
# NOTE(review): `factor.year.` is the sanitized column name older broom
# versions produced from `factor(year)`; newer broom keeps the backticked
# name `factor(year)` instead -- confirm against the installed broom version.
ggplot(data = mod_aug, mapping = aes(y = hwy, x = displ, color = factor.year.)) +
geom_point(alpha = 0.3) +
geom_line(aes(y = .fitted), lwd = 1)
Parallel slopes models are so-named because we can visualize these models in the data space as not one line, but two parallel lines.
Most often our goal is to interpret the coefficients. What does the model tell us about the relationship between engine size and fuel economy in the context of the year the cars were manufactured?
Coefficients:
(Intercept) displ factor(year)2008
35.276 -3.611 1.402
35.28 is the expected fuel economy of a car from 1999 with an engine size of 0 liters. Of course, there is no car with a 0-liter engine, so this is just an interpretation of the intercept. The model tells us that for every additional liter of engine size, fuel economy decreases by 3.611 mpg. 1.4 is the average difference in mpg between 1999 and 2008 for the same type of car. Other notes:
# Load the marioKart data set (eBay auctions of the MarioKart game)
# shipped with the openintro package, and inspect its structure.
data(marioKart)
glimpse(marioKart)
## Observations: 143
## Variables: 12
## $ ID <dbl> 150377422259, 260483376854, 320432342985, 28040522467…
## $ duration <int> 3, 7, 3, 3, 1, 3, 1, 1, 3, 7, 1, 1, 1, 1, 7, 7, 3, 3,…
## $ nBids <int> 20, 13, 16, 18, 20, 19, 13, 15, 29, 8, 15, 15, 13, 16…
## $ cond <fct> new, used, new, new, new, new, used, new, used, used,…
## $ startPr <dbl> 0.99, 0.99, 0.99, 0.99, 0.01, 0.99, 0.01, 1.00, 0.99,…
## $ shipPr <dbl> 4.00, 3.99, 3.50, 0.00, 0.00, 4.00, 0.00, 2.99, 4.00,…
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.9…
## $ shipSp <fct> standard, firstClass, firstClass, standard, media, st…
## $ sellerRate <int> 1580, 365, 998, 7, 820, 270144, 7284, 4858, 27, 201, …
## $ stockPhoto <fct> yes, yes, no, yes, yes, yes, yes, yes, yes, no, yes, …
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1,…
## $ title <fct> "~~ Wii MARIO KART & WHEEL ~ NINTENDO Wii ~ BRAND…
# Parallel slopes model: total auction price as a function of the number
# of wheels included (numeric) and condition (categorical: new/used).
mod_kart <- lm(totalPr ~ wheels + cond, data = marioKart)
# Attach fitted values, residuals, and diagnostics to the original rows.
mod_kart_aug <- mod_kart %>% augment() # %>% filter(totalPr < 100)
glimpse(mod_kart_aug)
## Observations: 143
## Variables: 10
## $ totalPr <dbl> 51.55, 37.04, 45.50, 44.00, 71.00, 45.00, 37.02, 53.9…
## $ wheels <int> 1, 1, 1, 1, 2, 0, 0, 2, 1, 1, 2, 2, 2, 2, 1, 0, 1, 1,…
## $ cond <fct> new, used, new, new, new, new, used, new, used, used,…
## $ .fitted <dbl> 47.88342, 48.72916, 47.88342, 47.88342, 58.09954, 37.…
## $ .se.fit <dbl> 3.532896, 2.696310, 3.532896, 3.532896, 3.375000, 5.2…
## $ .resid <dbl> 3.666579, -11.689162, -2.383421, -3.883421, 12.900457…
## $ .hat <dbl> 0.02093127, 0.01219196, 0.02093127, 0.02093127, 0.019…
## $ .sigma <dbl> 24.504954, 24.486658, 24.506118, 24.504709, 24.482054…
## $ .cooksd <dbl> 1.640983e-04, 9.543509e-04, 6.933998e-05, 1.840819e-0…
## $ .std.resid <dbl> 0.15174745, -0.48163062, -0.09864186, -0.16072186, 0.…
# Raw data as points, with the two parallel fitted lines (one per
# condition) drawn from the .fitted column produced by augment().
ggplot(mod_kart_aug, aes(x = wheels, y = totalPr, color = cond)) +
  geom_point(alpha = 0.3) +
  geom_line(aes(y = .fitted), lwd = 1) +
  ylim(25, 80)
## Warning: Removed 2 rows containing missing values (geom_point).
Interpretation of slope coef - For each additional wheel, the expected price of a MarioKart increases by $7.23 regardless of whether it is new or used.
# Load the babies data set (1,236 birth records) from the openintro
# package and inspect its structure.
data(babies)
glimpse(babies)
## Observations: 1,236
## Variables: 8
## $ case <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,…
## $ bwt <int> 120, 113, 128, 123, 108, 136, 138, 132, 120, 143, 140,…
## $ gestation <int> 284, 282, 279, NA, 282, 286, 244, 245, 289, 299, 351, …
## $ parity <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ age <int> 27, 33, 28, 36, 23, 25, 33, 23, 25, 30, 27, 32, 23, 36…
## $ height <int> 62, 64, 64, 69, 67, 62, 62, 65, 62, 66, 68, 64, 63, 61…
## $ weight <int> 100, 135, 115, 190, 125, 93, 178, 140, 125, 136, 120, …
## $ smoke <int> 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, …
# Visually examine the data: birthweight vs mother's age, colored by
# parity (first child or not). parity is stored as an integer 0/1, so
# wrap it in factor() to get a discrete two-color scale rather than a
# continuous gradient -- consistent with the factor(smoke) plot below.
ggplot(data = babies, mapping = aes(x = age, y = bwt, color = factor(parity))) + geom_point()
## Warning: Removed 2 rows containing missing values (geom_point).
# Build a model for birthweight as a function of mother's age and whether
# or not it is her first child (parity is a 0/1 indicator variable).
lm(bwt ~ age + parity, data = babies)
##
## Call:
## lm(formula = bwt ~ age + parity, data = babies)
##
## Coefficients:
## (Intercept) age parity
## 118.27782 0.06315 -1.65248
# Visually examine the data: birthweight vs gestation length, colored by
# smoking status (smoke is a 0/1 integer, so factor() yields a discrete
# color scale).
ggplot(babies, aes(gestation, bwt, color = factor(smoke))) +
  geom_point(alpha = 0.6)
## Warning: Removed 13 rows containing missing values (geom_point).
# Build a model for birthweight as a function of the length of gestation
# and the mother's smoking status. smoke enters as a numeric 0/1
# indicator, so its coefficient is the expected birthweight difference
# for smokers, holding gestation fixed.
lm(bwt ~ gestation + smoke, data = babies)
##
## Call:
## lm(formula = bwt ~ gestation + smoke, data = babies)
##
## Coefficients:
## (Intercept) gestation smoke
## -0.9317 0.4429 -8.0883
Residuals are the differences between the actual value and the predicted(fitted) value from model (line of best fit)
Model fitting procedure minimizes the residuals over the entire data space.
Coefficient of determination (R²) is the same as in simple linear regression. It measures the percentage of the variability in the response variable that is explained by the model. To compute it, we define R² = 1 − SSE/SST.
The adjusted R² is used to assess model performance for multiple linear regression. The only difference is that a penalty is applied as the number of explanatory variables increases:
R²adj = 1 − (SSE/SST) · ((n − 1)/(n − p − 1))
We can see both measures in the output of the summary() function on our model object.
augment(mod) returns a data frame with the original values along with other calculations from the model, including the fitted values and residuals. This function comes from the broom package and is used in the tidyverse.
Making out of sample predictions - you can use augment(mod, newdata)
# See how the R^2 values change just by adding an extra variable called
# noise to the marioKart data set.
# Multiple R^2 and adjusted R^2 are reported near the bottom of the
# summary() output, along with coefficient estimates and p-values.
summary(mod_kart)
##
## Call:
## lm(formula = totalPr ~ wheels + cond, data = marioKart)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.719 -6.462 -2.513 0.487 267.565
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.6673 5.2795 7.135 4.78e-11 ***
## wheels 10.2161 2.6740 3.821 0.0002 ***
## condused 0.8457 4.5855 0.184 0.8539
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.42 on 140 degrees of freedom
## Multiple R-squared: 0.1091, Adjusted R-squared: 0.09638
## F-statistic: 8.573 on 2 and 140 DF, p-value: 0.0003075
# Add a column of pure random noise, unrelated to the response.
# set.seed() makes the simulated noise -- and hence the R^2 comparison
# below -- reproducible across runs.
set.seed(1234)
mario_kart_noisy <- marioKart %>%
  mutate(noise = rnorm(nrow(marioKart)))
# Refit the model with the useless noise predictor included.
mod_kart_2 <- lm(totalPr ~ wheels + cond + noise, data = mario_kart_noisy)
# Multiple R^2 rises slightly (it never decreases when a variable is
# added), while adjusted R^2 penalizes the extra predictor.
summary(mod_kart_2)
##
## Call:
## lm(formula = totalPr ~ wheels + cond + noise, data = mario_kart_noisy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.383 -6.528 -2.843 0.476 267.855
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.1682 5.3096 7.189 3.67e-11 ***
## wheels 10.0273 2.6831 3.737 0.000271 ***
## condused 0.3991 4.6130 0.087 0.931187
## noise -1.9499 2.1011 -0.928 0.354977
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24.43 on 139 degrees of freedom
## Multiple R-squared: 0.1146, Adjusted R-squared: 0.09548
## F-statistic: 5.997 on 3 and 139 DF, p-value: 0.0007144
# Make predictions: augment() returns fitted values and per-row
# diagnostics (.resid, .hat, .cooksd, ...) for the training data.
# Pass a `newdata` argument for out-of-sample predictions.
augment(mod_kart_2)
## # A tibble: 143 x 11
## totalPr wheels cond noise .fitted .se.fit .resid .hat .sigma
## <dbl> <int> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 51.6 1 new -1.31 50.8 4.70 0.793 0.0370 24.5
## 2 37.0 1 used 1.15 46.4 3.72 -9.32 0.0231 24.5
## 3 45.5 1 new 0.923 46.4 3.88 -0.896 0.0252 24.5
## 4 44 1 new 0.280 47.6 3.54 -3.65 0.0210 24.5
## 5 71 2 new 0.702 56.9 3.63 14.1 0.0221 24.5
## 6 45 0 new -0.139 38.4 5.35 6.56 0.0479 24.5
## 7 37.0 0 used 0.241 38.1 3.52 -1.08 0.0208 24.5
## 8 54.0 2 new 0.712 56.8 3.64 -2.84 0.0222 24.5
## 9 47 1 used 0.0808 48.4 2.72 -1.44 0.0124 24.5
## 10 50 1 used -1.07 50.7 3.43 -0.688 0.0197 24.5
## # … with 133 more rows, and 2 more variables: .cooksd <dbl>,
## # .std.resid <dbl>
Including an interaction term in a model is easy—we just have to tell lm() that we want to include that new variable. The use of the colon (:) here means that the interaction between x and z will be a third term in the model.
# interaction term
# lm(y ~ x + z + x:z, data = mydata)
# include interaction: the cond:duration term lets the slope on duration
# differ between new and used items (the shorthand cond * duration
# expands to the same model).
lm(totalPr ~ cond + duration + cond:duration, data = marioKart)
##
## Call:
## lm(formula = totalPr ~ cond + duration + cond:duration, data = marioKart)
##
## Coefficients:
## (Intercept) condused duration
## 58.268 -17.564 -1.966
## condused:duration
## 3.305
Interaction allows the slope of the regression line in each group to vary. In this case, this means that the relationship between the final price and the length of the auction is moderated by the condition of each item.
Interaction models are easy to visualize in the data space with ggplot2 because they have the same coefficients as if the models were fit independently to each group defined by the level of the categorical variable. In this case, new and used MarioKarts each get their own regression line. To see this, we can set an aesthetic (e.g. color) to the categorical variable, and then add a geom_smooth() layer to overlay the regression line for each color.
# Interaction plot: geom_smooth(method = "lm") fits a separate
# least-squares line within each color (condition) group, which matches
# the interaction model's group-specific slopes.
ggplot(marioKart, aes(duration, totalPr, color = cond)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ylim(30, 70)
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
# Simple linear regression plot ignoring condition: one pooled line.
# `se` is a logical flag; use FALSE rather than 0 (which only works via
# numeric-to-logical coercion) for clarity and consistency with the
# interaction plot above.
ggplot(marioKart, aes(y = totalPr, x = duration)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  ylim(30, 70)
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
lm(totalPr ~ duration, data = marioKart)
##
## Call:
## lm(formula = totalPr ~ duration, data = marioKart)
##
## Coefficients:
## (Intercept) duration
## 51.4246 -0.4097
# This is an example of Simpson's paradox. When you look at both used and new combined, the total price goes down with duration. But looking at them separately shows what is really happening.
“For every additional thousand dollars of salary for teachers in a particular state, the expected SAT score for a student from that state is about 2 points higher, after controlling for the percentage of students taking the SAT.”
# Fit the model using duration and startPr -- two numeric explanatory
# variables instead of one numeric and one categorical.
lm(totalPr ~ duration + startPr, data = marioKart)
##
## Call:
## lm(formula = totalPr ~ duration + startPr, data = marioKart)
##
## Coefficients:
## (Intercept) duration startPr
## 50.6269 -0.5173 0.1371
# Visualizing multiple regression with two numeric predictors in 3D.
# plot_ly()'s first argument is the data, so it pipes naturally; the
# `~` formulas map columns to the three axes.
p <- marioKart %>%
  plot_ly(z = ~totalPr, x = ~duration, y = ~startPr, opacity = 0.6) %>%
  add_markers()
p